0: Preparation

Defining the input and output files

# Clean the working environment
rm(list = ls())

knitr::opts_chunk$set(echo = TRUE)

# empirical_species <- "German Shepherd"

empirical_dog_breed <- Sys.getenv("empirical_dog_breed")

empirical_species <- paste(toupper(substring(unlist(strsplit(empirical_dog_breed, "_")), 1, 1)),
                             substring(unlist(strsplit(empirical_dog_breed, "_")), 2),
                             sep = "", collapse = " ")
# empirical_species <- "Labrador Retriever"
# document_title <- paste(empirical_species,"Pipeline Results - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"55 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"40 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

N_e_bottleneck <- as.numeric(Sys.getenv("N_e_bottleneck"))
n_simulated_generations_breed_formation <- as.numeric(Sys.getenv("n_simulated_generations_breed_formation"))
reference_population_for_snp_chip <- Sys.getenv("reference_population_for_snp_chip")

document_title <- paste0(empirical_species," ",n_simulated_generations_breed_formation," Gen Breed Formation - SNP chip: ",reference_population_for_snp_chip," - ",N_e_bottleneck," N_e bottleneck scenario")

document_sub <- Sys.getenv("document_sub_title") 


# # 
# # MAF_pruning_used <- TRUE
# MAF_pruning_used <- FALSE
# 
# N_e <- 340
# # document_sub <- paste("MAF 0.05 used, but only for H_E-computation, N_e=", N_e)
# # document_sub <- paste("MAF 0.01 used, but only for H_E-computation, N_e=", N_e)
# 
# 
# 
# 
# 
# if (MAF_pruning_used == FALSE) {
#   document_sub <- paste("No MAF-based pruning used, N_e =", N_e)
# 
# 
# } else {
#   document_sub <- paste("MAF-based pruning used, N_e =", N_e)
# }

############################################ 
# Parameters used for displaying the result
############################################ 
ROH_frequency_decimals <- 1
H_e_values_decimals <- 3
F_ROH_values_decimals <- 3
Window_lengths_decimals <- 2

####################################  
# Defining the input files
#################################### 

Selection_strength_test_dir <- Sys.getenv("Selection_strength_test_dir")

Sweep_test_dir <-  Sys.getenv("Sweep_test_dir")

############### 
## Empirical ###
###############

### ROH hotspots ###
Empirical_data_ROH_hotspots_dir  <- Sys.getenv("Empirical_data_ROH_hotspots_dir")
Empirical_data_autosome_ROH_freq_dir <- Sys.getenv("Empirical_data_autosome_ROH_freq_dir")
### Inbreeding coefficient ###

Empirical_data_F_ROH_dir  <- Sys.getenv("Empirical_data_F_ROH_dir")

### Expected Heterozygosity distribution ###
Empirical_data_H_e_dir <- Sys.getenv("Empirical_data_H_e_dir")

############### 
## Simulated ###
###############

### Causative variant ###
variant_freq_plots_dir <- Sys.getenv("variant_freq_plots_dir")
variant_position_dir <- Sys.getenv("variant_position_dir")
pruned_replicates_count_dir <- Sys.getenv("pruned_replicates_count_dir")

Selection_causative_variant_windows_dir <- Sys.getenv("Selection_causative_variant_windows_dir")
causative_variant_windows_H_e_dir <- Sys.getenv("causative_variant_windows_H_e_dir")

### ROH hotspots ###
Neutral_model_ROH_hotspots_dir <- Sys.getenv("Neutral_model_ROH_hotspots_dir")
Neutral_model_autosome_ROH_freq_dir <- Sys.getenv("Neutral_model_autosome_ROH_freq_dir")

Selection_model_ROH_hotspots_dir  <- Sys.getenv("Selection_model_ROH_hotspots_dir")
Selection_model_autosome_ROH_freq_dir <- Sys.getenv("Selection_model_autosome_ROH_freq_dir")

### Inbreeding coefficient ###
Neutral_model_F_ROH_dir  <- Sys.getenv("Neutral_model_F_ROH_dir")
Selection_model_F_ROH_dir  <- Sys.getenv("Selection_model_F_ROH_dir")

### Expected Heterozygosity distribution ###
Neutral_model_H_e_dir <- Sys.getenv("Neutral_model_H_e_dir")
Selection_model_H_e_dir <- Sys.getenv("Selection_model_H_e_dir")


histogram_line_sizes <- 3
empirical_data_color <- "darkgreen"
neutral_model_color <- "blue" 
selection_model_color <- "purple"

output_dir <- Sys.getenv("Pipeline_results_output_dir") 


# Sys.getenv()  

# # Set the working directory for notebook chunks
# knitr::opts_knit$set(root.dir = output_dir_sweep_test)

Loading libraries

if (!require(knitr)) install.packages("knitr", dependencies = TRUE)
## Loading required package: knitr
## Warning: package 'knitr' was built under R version 4.3.2
library(knitr)

if (!require(ggplot2)) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
library(ggplot2)

if (!require(scatterplot3d)) install.packages("scatterplot3d", dependencies = TRUE)
## Loading required package: scatterplot3d
library(scatterplot3d) # For generating the 3d plots 

if (!require(RColorBrewer)) install.packages("RColorBrewer", dependencies = TRUE)
## Loading required package: RColorBrewer
library(RColorBrewer) # For generating a color palette

Latex formatting function

Causative variant

Variant fixation

Fixation probability

Fixation time

# Function to determine the number of rows until fixation is reached
time_to_fixation <- function(data, threshold = 0.9) {
  # Find the index of the first row where the allele frequency is 0.9 or higher
  fixation_index <- which(data$allele_frequency >= threshold)[1]
  
  # Return the number of rows until fixation is reached
  return(fixation_index - 1)
}

Summary - Variant fixation

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_variant_Fixation_summary.txt"
Selection_coefficient Fixation_probability Avg_Fixation_time Min_fixation_time Max_fixation_time
s=0.1 27.4 56 41 72
s=0.2 74.1 32 16 50
s=0.3 80.0 21 12 32
s=0.4 71.4 15 10 21
s=0.5 76.9 12 9 16
s=0.6 69.0 10 7 13
s=0.7 71.4 8 6 11
s=0.8 87.0 7 5 8

Causative variant windows

Variant position

Window lengths

Causative variant windows

Summary - Causative variant windows

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_variant_windows_summary.txt"
Selection_coefficient Avg_Length_Mb Avg_window_freq Min_freq Max_freq Avg_freq_variant_100k_window
4 s=0.4 2.48 71.6 16.2 100.0 72.7
1 s=0.1 2.49 64.2 13.0 97.3 66.5
2 s=0.2 2.56 78.2 13.2 100.0 80.3
3 s=0.3 2.85 83.9 22.2 100.0 85.3
6 s=0.6 2.96 86.7 27.0 100.0 87.4
5 s=0.5 3.24 89.1 28.6 100.0 90.9
8 s=0.8 3.62 77.0 13.0 100.0 77.1
7 s=0.7 4.08 75.2 11.4 100.0 75.8

Standard Error Confidence interval function

Confidence level <=> konfidensgrad

# Function to calculate standard error and its confidence interval
standard_error_confidence_interval_fun <- function(observed_data, confidence_level = 0.95) {
  # if ( nrow(observed_data) > 1 ) {
  
  if ( length(observed_data) > 1 ) {
      
  # Calculate standard error
  n <- length(observed_data)
  standard_deviation <- sd(observed_data)
  standard_error <- standard_deviation / sqrt(n - 1)
  
  # Calculate confidence interval based on standard error

  alpha <- (1 - confidence_level) / 2
  margin_of_error <- qnorm(1 - alpha) * standard_error
  mean_estimate <- mean(observed_data)
  # Calculate the percentiles for the confidence interval
  confidence_interval_lower_bound <- mean_estimate - margin_of_error # 2.5th percentile (2σ)
  confidence_interval_upper_bound <- mean_estimate + margin_of_error # 97.5th percentile (2σ)
  
  # Return confidence interval
  return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))

    
  } else {
    return(c(NA,NA)) 
    }
  
  
} 





# # Function to calculate bootstrap confidence intervals
# standard_error_confidence_interval_fun <- function(observed_data, n_bootstraps = 1000, confidence_level = 0.95) {
#   
#   # Function to calculate the mean for each bootstrap sample
#   compute_bootstrap_mean_fun <- function(observed_data_urn) {
#     bootstrap_dataset <- sample(observed_data_urn, replace = TRUE)
#     bootstrap_estimate <- mean(bootstrap_dataset)
#     return(bootstrap_estimate)
#   }
#   
#   # Perform bootstrap resampling
#   bootstrap_point_estimates <- replicate(n_bootstraps, compute_bootstrap_mean_fun(observed_data))
#   
#   # Calculate the percentiles for the confidence interval
#   alpha <- (1 - confidence_level) / 2
#   confidence_interval_lower_bound <- quantile(bootstrap_point_estimates, alpha) # 2.5th percentile
#   confidence_interval_upper_bound <- quantile(bootstrap_point_estimates, 1 - alpha) # 97.5th percentile
#   
#   # Return the confidence interval
#   return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
# }

1: ROH-Frequency

1.1 : Autosome ROH-frequencies

1.1.1 : Empirical - ROH frequency

1.1.2: Neutral Model - ROH frequency

1.1.3: Selection Model

1.1.4 Summary - ROH-hotspot threshold

## ROH-hotspot selection testing results://n
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/ROH-hotspot_threshold_comparison.txt"
Model Avg_ROH_hotspot_threshold Lower_CI Upper_CI
Neutral 55.1 49.7 60.5
Empirical 55.4 NA NA
s=0.1 69.1 61.9 76.2
s=0.2 81.9 73.6 90.1
s=0.4 82.2 73.0 91.5
s=0.8 85.3 76.9 93.7
s=0.7 85.8 77.9 93.6
s=0.3 88.6 80.9 96.3
s=0.6 90.9 82.8 99.0
s=0.5 92.0 84.0 100.0

1.2 ROH-hotspots - ROH Frequency and Length

2: Inbreeding coefficient

2.1 Empirical data

## Overall Average Avg_F_ROH for  labrador_retriever : 0.2333818 //n

2.2 Neutral Model

## Point estimate F_ROH across all 20 simulations: 0.2395518 //n
## [1] "Bootstrap 95% Confidence Interval: [0.226836581202471, 0.252266937716447]"

2.3 Selection Model

## Point estimate F_ROH across all 20 simulations for  selection_model_s01_chr3 : 0.2624342 //n[1] "Bootstrap 95% Confidence Interval: [0.250511789112086, 0.274356540617644]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s02_chr3 : 0.2853185 //n[1] "Bootstrap 95% Confidence Interval: [0.263763371225771, 0.306873555801256]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s03_chr3 : 0.3000388 //n[1] "Bootstrap 95% Confidence Interval: [0.28764506098842, 0.312432447119688]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s04_chr3 : 0.3299265 //n[1] "Bootstrap 95% Confidence Interval: [0.30413250667713, 0.355720496025573]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s05_chr3 : 0.3373121 //n[1] "Bootstrap 95% Confidence Interval: [0.311395306064513, 0.363228831773325]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s06_chr3 : 0.349535 //n[1] "Bootstrap 95% Confidence Interval: [0.321216785475909, 0.377853184794361]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s07_chr3 : 0.3468221 //n[1] "Bootstrap 95% Confidence Interval: [0.32020807718123, 0.373436203899852]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s08_chr3 : 0.3617499 //n[1] "Bootstrap 95% Confidence Interval: [0.328707403073181, 0.394792464494386]"

2.4 F_ROH summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/F_ROH_comparison.txt"
Model F_ROH Lower_CI Upper_CI
Empirical 0.233 NA NA
Neutral 0.240 0.227 0.252
s=0.1 0.262 0.251 0.274
s=0.2 0.285 0.264 0.307
s=0.3 0.300 0.288 0.312
s=0.4 0.330 0.304 0.356
s=0.5 0.337 0.311 0.363
s=0.7 0.347 0.320 0.373
s=0.6 0.350 0.321 0.378
s=0.8 0.362 0.329 0.395

3: Expected Heterozygosity

3.1 Empirical data

3.2 Neutral Model

3.3 Selection Model

3.4 Causative Variant Window

## Point estimate H_e across all 20 simulations for  s01_chr3 : 0.2574516 //n[1] "Bootstrap 95% Confidence Interval: [0.193956857338179, 0.32094628400949]"
## Point estimate H_e across all 20 simulations for  s02_chr3 : 0.218574 //n[1] "Bootstrap 95% Confidence Interval: [0.160994934923244, 0.276153083256386]"
## Point estimate H_e across all 20 simulations for  s03_chr3 : 0.2561486 //n[1] "Bootstrap 95% Confidence Interval: [0.157238173355029, 0.355059094407418]"
## Point estimate H_e across all 20 simulations for  s04_chr3 : 0.3104165 //n[1] "Bootstrap 95% Confidence Interval: [0.256054827329762, 0.364778101728456]"
## Point estimate H_e across all 20 simulations for  s05_chr3 : 0.2762171 //n[1] "Bootstrap 95% Confidence Interval: [0.177205061962401, 0.37522907834675]"
## Point estimate H_e across all 20 simulations for  s06_chr3 : 0.2882735 //n[1] "Bootstrap 95% Confidence Interval: [0.155513720471588, 0.421033222681486]"
## Point estimate H_e across all 20 simulations for  s07_chr3 : 0.309192 //n[1] "Bootstrap 95% Confidence Interval: [0.237226441144795, 0.381157653245608]"
## Point estimate H_e across all 20 simulations for  s08_chr3 : 0.308711 //n[1] "Bootstrap 95% Confidence Interval: [0.221882220268002, 0.395539728972352]"

3.4 Genomewide Average H_e

Model H_e Lower_CI Upper_CI
s=0.6 0.347 0.340 0.354
s=0.4 0.352 0.344 0.359
s=0.5 0.352 0.347 0.357
s=0.8 0.354 0.346 0.362
Empirical 0.355 NA NA
s=0.7 0.356 0.350 0.362
s=0.3 0.358 0.352 0.363
s=0.1 0.360 0.357 0.364
s=0.2 0.360 0.357 0.363
Neutral 0.360 0.356 0.365

3.5 Genomewide 5th percentile comparison - Expected Heterozygosity Summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Expected_Heterozygosity_Summary.txt"
Model H_e_5th_percentile Lower_CI Upper_CI
s=0.6 0.170 0.157 0.182
s=0.4 0.182 0.170 0.194
s=0.5 0.182 0.171 0.193
s=0.7 0.182 0.171 0.194
s=0.8 0.182 0.167 0.197
s=0.3 0.195 0.184 0.207
s=0.2 0.200 0.191 0.209
s=0.1 0.208 0.203 0.213
Empirical 0.209 NA NA
Neutral 0.213 0.205 0.221

4: Summary

4.0 General comparison

4.0.1 ROH-hotspot threshold comparison

## 
##  ROH-hotspot threshold comparison between the different datasets
Model Avg_ROH_hotspot_threshold Lower_CI Upper_CI
Neutral 55.1 49.7 60.5
Empirical 55.4 NA NA
s=0.1 69.1 61.9 76.2
s=0.2 81.9 73.6 90.1
s=0.4 82.2 73.0 91.5
s=0.8 85.3 76.9 93.7
s=0.7 85.8 77.9 93.6
s=0.3 88.6 80.9 96.3
s=0.6 90.9 82.8 99.0
s=0.5 92.0 84.0 100.0

4.0.2 F_ROH comparison

## Models where empirical F_ROH is within CI:
## [1] "Neutral"
## 
## Models where empirical F_ROH is outside CI:
## [1] "s=0.1" "s=0.2" "s=0.3" "s=0.4" "s=0.5" "s=0.6" "s=0.7" "s=0.8"
## png 
##   2
Model F_ROH Lower_CI Upper_CI
Empirical 0.233 NA NA
Neutral 0.240 0.227 0.252
s=0.1 0.262 0.251 0.274
s=0.2 0.285 0.264 0.307
s=0.3 0.300 0.288 0.312
s=0.4 0.330 0.304 0.356
s=0.5 0.337 0.311 0.363
s=0.7 0.347 0.320 0.373
s=0.6 0.350 0.321 0.378
s=0.8 0.362 0.329 0.395

4.1: Hotspot comparison

4.1.1: Selection test (Sweep test)

## [1] "Selection test results"
## [1] "ROH-hotspot windows with an mean H_e Value lower or equal to the lower confidence interval of the fifth percentile of the neutral model are classified as being under selection"
## [1] "5th percentile of the neutral model is: 0.2046964"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.124
Hotspot_chr28_window_1 Yes 0.197
Hotspot_chr24_window_1 No 0.217
Hotspot_chr13_window_1 No 0.225
Hotspot_chr6_window_1 No 0.263
Hotspot_chr15_window_1 No 0.268
Hotspot_chr11_window_2 No 0.284
Hotspot_chr11_window_1 No 0.289
Hotspot_chr8_window_1 No 0.369
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/ROH_hotspots_Selection_testing_neutral_model_H_E_threshold_0.205.csv.txt"
## [1] "ROH-hotspots under selection:"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.124
Hotspot_chr28_window_1 Yes 0.197

4.1.2: Selection Strength Test (Sweep test)

4.1.1.1 Extracting Causative windows under selection

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_05/Causative_windows_under_selection.txt"
Model H_e Lower_CI Upper_CI Under_Selection
Neutral 0.213 0.205 0.221 No
s=0.2 0.219 0.161 0.276 No
s=0.3 0.256 0.157 0.355 No
s=0.1 0.257 0.194 0.321 No
s=0.5 0.276 0.177 0.375 No
s=0.6 0.288 0.156 0.421 No
s=0.7 0.309 0.237 0.381 No
s=0.8 0.309 0.222 0.396 No
s=0.4 0.310 0.256 0.365 No

4.1.1.1 Extracting Hotspots under selection

*** Behöver bootstrap av 5th percentiles från neutral model

Hotspot - Causative Window - Comparison

Model Lengths_Mb ROH_Freq H_e Under_selection
s=0.8 3.62 77.0 0.309 No
s=0.7 4.08 75.2 0.309 No
s=0.6 2.96 86.7 0.288 No
s=0.5 3.24 89.1 0.276 No
s=0.4 2.48 71.6 0.310 No
s=0.3 2.85 83.9 0.256 No
s=0.2 2.56 78.2 0.219 No
s=0.1 2.49 64.2 0.257 No
Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")


# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot displayal
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)



# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)


# # Create the 3D scatter plot
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, # Solid circle
#   xlab = "Lengths",
#   ylab = "ROH Frequency",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# text(s3d.coords$x, s3d.coords$y, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
#      pos = 3, cex = 0.5) # pos=3 means above


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Avg ROH-frequency (%)",
  ylab = "Length (Mb)",
  zlab = "Avg H_e",
  main = plot_title
)

# )

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# Test med skuggor

setwd(output_dir)

# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot display
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

# Generate a color palette for the hotspots
hotspot_models <- unique(Hotspots_and_Causative_windows_comparison_sorted$Model[Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot"])
num_hotspots <- length(hotspot_models)

# Choose the color palette based on the number of hotspots
if (num_hotspots == 2) {
  color_palette_name <- "Set2"
} else {
  color_palette_name <- "Set3"
}

# Get the colors for the hotspots
hotspot_colors <- setNames(brewer.pal(n = num_hotspots, name = color_palette_name), hotspot_models)
## Warning in brewer.pal(n = num_hotspots, name = color_palette_name): minimal value for n is 3, returning requested palette with 3 different levels
# Assign colors to each point
Hotspots_and_Causative_windows_comparison_sorted$Color <- ifelse(
  Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", 
  hotspot_colors[Hotspots_and_Causative_windows_comparison_sorted$Model], 
  "blue"
)

x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
y_axis_label <- "Length (Mb)"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
z_axis_label <- "Avg H_e"


# x_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# x_axis_label <- "Length (Mb)"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# y_axis_label <- "Avg H_e"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# z_axis_label <- "Avg ROH-frequency (%)"


x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
y_axis_label <- "Avg H_e"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
z_axis_label <- "Length (Mb)"

# x_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# x_axis_label <- "Avg H_e"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# y_axis_label <- "Avg ROH-frequency (%)"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# z_axis_label <- "Length (Mb)"

# # Create and save the 3D scatter plot as a PNG file
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080, res = 300)
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png",width = 800, height = 600, res = 300)


# Create the 3D scatter plot
s3d <- scatterplot3d(
  x_value  ,
  y_value,
  z_value,
  color = Hotspots_and_Causative_windows_comparison_sorted$Color,
  pch = 19, # Solid circle
  xlab = x_axis_label,
  ylab = y_axis_label,
  zlab = z_axis_label,
  main = plot_title
)

# Add shadows on the x-y plane (z = 0)
s3d$points3d(
  x_value,
  y_value,
  rep(0, nrow(Hotspots_and_Causative_windows_comparison_sorted)),  # Set z to 0 for shadow
  col = adjustcolor(Hotspots_and_Causative_windows_comparison_sorted$Color, alpha.f = 0.3), # Semi-transparent shadows
  pch = 19
)

# Convert coordinates for adding labels
s3d.coords <- s3d$xyz.convert(
  x_value,
  y_value,
  z_value
)

# Add labels to the original points
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# # Close the graphics device
# dev.off()

Fixar överlappande labels

# # Increase plot size
# par(mar = c(5, 5, 5, 5)) # Adjust margins if needed
# 
# s3d <- scatterplot3d(
#   # x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb ,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   z = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb  ,
# 
#   
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, 
#   # xlab = "Avg ROH-frequency (%)",
#   # ylab = "Length (Mb)",
#   # zlab = "H_e",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "H_e",
#   zlab = "Length (Mb)",
# 
# 
#   
#   main = plot_title
# )
# # # Add labels to the points
# # s3d.coords <- s3d$xyz.convert(
# #   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
# #   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
# #   Hotspots_and_Causative_windows_comparison_sorted$H_e
# # )
# 
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# )
# 
# # 
# # # Add jitter to the label positions
# # text(s3d.coords$x + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      s3d.coords$y + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
# #      pos = 3, cex = 0.5)
# # 
# # 
# text(s3d.coords$x, s3d.coords$y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
#      pos = 3, cex = 0.5) # pos=3 means above

jittering on all axis

# setwd(output_dir)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # windows(width = 1920 / 96, height = 1080 / 96)  # 96 DPI is default
# # # # Create and save the 3D scatter plot as a PNG file
# # png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080)
# 
# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# # Subset data for non-selection points
# non_selection_data <- Hotspots_and_Causative_windows_comparison_sorted[non_selection_indices, ]
# # s3d.coords$x[non_selection_indices]
# # 7.888889 6.278889 6.715556 7.655556 6.674444 5.723333 6.143333 5.773333 6.212222
# # custom_jitter_x_axis_hotspots <- c(0,0,-0.05,0,0.1,-0.4,0,0,0)  # Adjust as needed
# custom_jitter_x_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)  # Adjust as needed
# 
# # s3d.coords$y[non_selection_indices]
# #3.7711111 2.4511111 1.9444444 1.5644444 2.5355556 3.1066667 2.6066667 3.1466667 0.8977778
# # custom_jitter_y_axis_hotspots <- c(0,-0.7,-0.65,0,0,-0.3,0,0.1,0)   # Adjust as needed
# custom_jitter_y_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)   # Adjust as needed
# 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices]+ custom_jitter_x_axis_hotspots, 
#      s3d.coords$y[non_selection_indices]+ custom_jitter_y_axis_hotspots, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices], 
#      pos = 3, cex = 0.7)
# 
# 
# 
# selection_data <- Hotspots_and_Causative_windows_comparison_sorted[selection_indices, ]
# custom_jitter_x_axis_sel_coeff <- c(0.2,0.05,0.025,0,-0.05,0,0)
# # custom_jitter_x_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# custom_jitter_y_axis_sel_coeff <- c(-0.25,0,-0.57,-0.1,-0.6,0,0)
# # custom_jitter_y_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# # Apply jitter to selection point labels
# label_jitter <- runif(sum(selection_indices), 0, 0.2) # Small random jitter
# # Add labels for selection points with jitter
# text(s3d.coords$x[selection_indices] + custom_jitter_x_axis_sel_coeff, 
#      s3d.coords$y[selection_indices] + custom_jitter_y_axis_sel_coeff, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices], 
#      pos = 3, cex = 0.7)

# # Close the graphics device
# dev.off()

# kable(selection_data,row.names = FALSE)
# kable(non_selection_data,row.names = FALSE)

#Jitter z-axis

# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Generate small random jitter for the y-axis (simulating z-axis movement)
# y_jitter <- runif(sum(selection_indices), -0.02, 0.02)
# 
# # Adjust y-coordinates for selection labels only
# jittered_y <- s3d.coords$y[selection_indices] + y_jitter
# 
# # Add labels for selection points with y-axis adjustment
# text(s3d.coords$x[selection_indices],
#      jittered_y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#      pos = 3, cex = 0.7)
# # 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices],
#      s3d.coords$y[non_selection_indices],
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#      pos = 3, cex = 0.7)
# Sort the data frame based on average fixation time, in descending order
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$ROH_Freq), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
7 s=0.5 3.24 89.1 0.276 No
8 s=0.6 2.96 86.7 0.288 No
5 s=0.3 2.85 83.9 0.256 No
4 s=0.2 2.56 78.2 0.219 No
10 s=0.8 3.62 77.0 0.309 No
9 s=0.7 4.08 75.2 0.309 No
2 Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
6 s=0.4 2.48 71.6 0.310 No
3 s=0.1 2.49 64.2 0.257 No
1 Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$H_e), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
6 s=0.4 2.48 71.6 0.310 No
9 s=0.7 4.08 75.2 0.309 No
10 s=0.8 3.62 77.0 0.309 No
8 s=0.6 2.96 86.7 0.288 No
7 s=0.5 3.24 89.1 0.276 No
3 s=0.1 2.49 64.2 0.257 No
5 s=0.3 2.85 83.9 0.256 No
4 s=0.2 2.56 78.2 0.219 No
2 Hotspot_chr28_window_1 1.40 75.0 0.197 Yes
1 Hotspot_chr1_window_1 0.20 61.9 0.124 Yes
# # Load the required packages
# library(rgl)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # Open a new 3D plotting device
# open3d()
# 
# # Plot with rgl
# plot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   col = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   type = 's', # Use points
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d_coords <- cbind(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Add labels for non-selection points with custom jitter
# text3d(
#   s3d_coords[non_selection_indices, ] + cbind(custom_jitter_x_axis_hotspots, custom_jitter_y_axis_hotspots, 0),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#   col = "red", # Color of the text
#   cex = 0.7
# )
# 
# # Add labels for selection points with custom jitter
# text3d(
#   s3d_coords[selection_indices, ] + cbind(custom_jitter_x_axis_sel_coeff, custom_jitter_y_axis_sel_coeff, label_jitter),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#   col = "blue", # Color of the text
#   cex = 0.7
# )

# # Save the 3D plot as a PNG file
# rgl.postscript("3dplot_Hotspot_Causative_Window_Comparison.png", fmt = "png")

# # Close the 3D device
# rgl.close()

Visualizing and scaling

# Min-max-scaling normalization function, that normalizes a vector of values
min_max_normalization_fun <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
### Min max scaling ###
# Normalize Lengths_Mb
Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb)
# Normalize ROH Frequency
Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq)
# Normalize H_e
Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$H_e)

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

plot_title <- paste("Normalized ROH Hotspot(s) & Causative Windows comparison")


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Normalized Lengths",
  ylab = "Normalized Mean ROH Frequency",
  zlab = "Normalized H_e",
  main = plot_title
)

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

5 Visualizing Expected Heterozygoisty

5.1 Bootstrap CI for mean 5th percentile H_e

## Models where empirical H_e is within CI for Hotspot_chr1_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr1_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr28_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr28_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr24_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr24_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr13_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr13_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr6_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr6_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr15_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr15_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr11_window_2 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_2 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr11_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_1 :
## character(0)

## Models where empirical H_e is within CI for Hotspot_chr8_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr8_window_1 :
## character(0)

5.2 5th percentile of the extreme H_e values